Spatial cross validation

Explore how spatial cross validation can be used to predict POC from environmental variables.

Author

Thelma Panaïotis

# Load shared helpers (defines `seed` and `n_cores` — confirm in utils.R),
# spatial packages, and the prepared dataset
source("utils.R")
library(sf)
library(spatialsample)
load("data/04.all_data.Rdata")
# Model the log-transformed POC as the response
df$poc_log <- log(df$poc)

# CV set-up
n_folds <- 10  # number of folds
n_reps <- 1    # number of repeats

# Explanatory variables: select them from df (errors early if one is
# missing) and keep only the column names
exp_vars <- df %>%
  select(
    temperature, phosphate, silicate, alkalinity,
    dic, npp, oxygen
  ) %>%
  names()

Stratified CV

Cross-validation stratified on quintiles (default) of the response variable.

Prepare data

# Cross-validation, 10 folds, stratified on quintiles (default) of poc_log
set.seed(seed)
df_folds <- vfold_cv(df, v = n_folds, strata = poc_log, repeats = n_reps)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every fold into one long table, labelled by split type and fold.
dist_folds <- bind_rows(lapply(seq_len(nrow(df_folds)), function(i) {
  spl <- df_folds$splits[[i]]
  
  # With repeats, the rset has id (repeat) and id2 (fold); otherwise just id
  if (ncol(df_folds) == 2) {
    fold_lab <- df_folds$id[[i]]
  } else {
    fold_lab <- paste(df_folds$id[[i]], df_folds$id2[[i]])
  }
  
  bind_rows(
    analysis(spl) %>% mutate(split = "analysis"),
    assessment(spl) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold_lab)
}))

# Density of log(POC) per fold, analysis vs assessment
ggplot(dist_folds) +
  geom_density(
    aes(x = poc_log, colour = fold, linetype = split),
    show.legend = FALSE
  ) +
  facet_wrap(~fold)

# Same densities overlaid across folds, one panel per split type
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

Analysis (i.e. train) and assessment (i.e. test) distributions are very similar for some folds but differ for others. We can therefore expect some dispersion of R² across folds.

Train model

# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),       # number of boosting rounds
  tree_depth = tune(),  # maximum tree depth
  min_n = tune(),       # minimum node size
  learn_rate = tune()   # shrinkage
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` is a ranger
  # argument and is silently ignored, so the model ran single-threaded
  set_engine("xgboost", nthread = n_cores)


## Recipe & Workflow
# Build the model formula from the list of explanatory variables. Raw `poc`
# is put in the formula only so it travels with the data; its role is then
# changed so it is not used as a predictor.
xgb_form <- as.formula(
  paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + "))
)
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")

xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)


## Gridsearch
# Latin hypercube over the four tuned hyperparameters
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(), learn_rate(), tree_depth(), min_n(),
  size = 30
)
doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)

#autoplot(xgb_res)
# Keep the hyperparameter combination with the lowest RMSE
best_xgb <- select_best(xgb_res, metric = "rmse")

Evaluate model

## Finalize: plug the selected hyperparameters into the workflow
final_xgb <- finalize_workflow(xgb_wflow, best_xgb)

## Evaluate: refit on each analysis set and score each assessment set
final_res_cv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_cv)
# A tibble: 2 × 6
  .metric .estimator  mean     n std_err .config             
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1 rmse    standard   0.330    10  0.0211 Preprocessor1_Model1
2 rsq     standard   0.886    10  0.0139 Preprocessor1_Model1
# Per-fold R² values, tagged with the CV method for the final comparison.
# Use FALSE, not F (T/F are ordinary, reassignable bindings).
cv_metrics <- collect_metrics(final_res_cv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Strat")
ggplot(cv_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

Indeed, some dispersion of R².

Stratified CV, more constrained

Cross-validation stratified on deciles of the response variable.

Prepare data

# Cross-validation, 10 folds, stratified on deciles of poc_log
# (breaks = 9 cut points -> 10 strata)
set.seed(seed)
df_folds <- vfold_cv(df, v = n_folds, strata = poc_log, breaks = 9, repeats = n_reps)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every fold into one long table, labelled by split type and fold.
dist_folds <- bind_rows(lapply(seq_len(nrow(df_folds)), function(i) {
  spl <- df_folds$splits[[i]]
  
  # With repeats, the rset has id (repeat) and id2 (fold); otherwise just id
  if (ncol(df_folds) == 2) {
    fold_lab <- df_folds$id[[i]]
  } else {
    fold_lab <- paste(df_folds$id[[i]], df_folds$id2[[i]])
  }
  
  bind_rows(
    analysis(spl) %>% mutate(split = "analysis"),
    assessment(spl) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold_lab)
}))

# Density of log(POC) per fold, analysis vs assessment
ggplot(dist_folds) +
  geom_density(
    aes(x = poc_log, colour = fold, linetype = split),
    show.legend = FALSE
  ) +
  facet_wrap(~fold)

# Same densities overlaid across folds, one panel per split type
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

Analysis (i.e. train) and assessment (i.e. test) distributions are very similar and coherent across folds. We can expect low dispersion of R².

Train model

# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),       # number of boosting rounds
  tree_depth = tune(),  # maximum tree depth
  min_n = tune(),       # minimum node size
  learn_rate = tune()   # shrinkage
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` is a ranger
  # argument and is silently ignored, so the model ran single-threaded
  set_engine("xgboost", nthread = n_cores)


## Recipe & Workflow
# Build the model formula from the list of explanatory variables. Raw `poc`
# is put in the formula only so it travels with the data; its role is then
# changed so it is not used as a predictor.
xgb_form <- as.formula(
  paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + "))
)
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")

xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)


## Gridsearch
# Latin hypercube over the four tuned hyperparameters
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(), learn_rate(), tree_depth(), min_n(),
  size = 30
)
doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)

#autoplot(xgb_res)
# Keep the hyperparameter combination with the lowest RMSE
best_xgb <- select_best(xgb_res, metric = "rmse")

Evaluate model

## Finalize: plug the selected hyperparameters into the workflow
final_xgb <- finalize_workflow(xgb_wflow, best_xgb)

## Evaluate: refit on each analysis set and score each assessment set
final_res_cv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_cv)
# A tibble: 2 × 6
  .metric .estimator  mean     n std_err .config             
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1 rmse    standard   0.335    10  0.0169 Preprocessor1_Model1
2 rsq     standard   0.884    10  0.0119 Preprocessor1_Model1
# Per-fold R² values, tagged with the CV method for the final comparison.
# Use FALSE, not F (T/F are ordinary, reassignable bindings).
cvmc_metrics <- collect_metrics(final_res_cv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Strat dec")
ggplot(cvmc_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

Note

Indeed, low dispersion of R².

Spatial block CV

Spatial cross validation based on block.

Prepare data

# Spatial block cross-validation, 10 folds; needs an sf object with
# point geometries built from lon/lat (WGS84)
df_sf <- st_as_sf(df, coords = c("lon", "lat"), crs = 4326)
set.seed(seed)
df_folds <- spatial_block_cv(df_sf, v = n_folds, repeats = n_reps)
autoplot(df_folds)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every fold into one long table, labelled by split type and fold.
dist_folds <- bind_rows(lapply(seq_len(nrow(df_folds)), function(i) {
  spl <- df_folds$splits[[i]]
  
  # With repeats, the rset has id (repeat) and id2 (fold); otherwise just id
  if (ncol(df_folds) == 2) {
    fold_lab <- df_folds$id[[i]]
  } else {
    fold_lab <- paste(df_folds$id[[i]], df_folds$id2[[i]])
  }
  
  bind_rows(
    analysis(spl) %>% mutate(split = "analysis"),
    assessment(spl) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold_lab)
}))

# Density of log(POC) per fold, analysis vs assessment
ggplot(dist_folds) +
  geom_density(
    aes(x = poc_log, colour = fold, linetype = split),
    show.legend = FALSE
  ) +
  facet_wrap(~fold)

# Same densities overlaid across folds, one panel per split type
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

Analysis distributions are very coherent but assessment distributions vary a lot. We can expect higher dispersion of R².

Train model

# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),       # number of boosting rounds
  tree_depth = tune(),  # maximum tree depth
  min_n = tune(),       # minimum node size
  learn_rate = tune()   # shrinkage
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` is a ranger
  # argument and is silently ignored, so the model ran single-threaded
  set_engine("xgboost", nthread = n_cores)


## Recipe & Workflow
# Build the model formula from the list of explanatory variables. Raw `poc`
# is put in the formula only so it travels with the data; its role is then
# changed so it is not used as a predictor.
xgb_form <- as.formula(
  paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + "))
)
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")

xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)


## Gridsearch
# Latin hypercube over the four tuned hyperparameters
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(), learn_rate(), tree_depth(), min_n(),
  size = 30
)
doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)

# Keep the hyperparameter combination with the lowest RMSE
best_xgb <- select_best(xgb_res, metric = "rmse")

Evaluate model

## Finalize: plug the selected hyperparameters into the workflow
final_xgb <- finalize_workflow(xgb_wflow, best_xgb)

## Evaluate: refit on each analysis set and score each assessment set
final_res_scv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_scv)
# A tibble: 2 × 6
  .metric .estimator  mean     n std_err .config             
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1 rmse    standard   0.470    10  0.0432 Preprocessor1_Model1
2 rsq     standard   0.724    10  0.0637 Preprocessor1_Model1
# Per-fold R² values, tagged with the CV method for the final comparison.
# Use FALSE, not F (T/F are ordinary, reassignable bindings).
sbcv_metrics <- collect_metrics(final_res_scv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Block")
ggplot(sbcv_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

Note

Indeed, higher dispersion.

Spatial cluster CV

Spatial cross validation based on spatial clusters.

Prepare data

# Spatial cluster cross-validation, 10 folds; needs an sf object with
# point geometries built from lon/lat (WGS84)
df_sf <- st_as_sf(df, coords = c("lon", "lat"), crs = 4326)
set.seed(seed)
df_folds <- spatial_clustering_cv(df_sf, v = n_folds, repeats = n_reps)
autoplot(df_folds)

# Check POC distribution across folds: stack the analysis and assessment
# rows of every fold into one long table, labelled by split type and fold.
dist_folds <- bind_rows(lapply(seq_len(nrow(df_folds)), function(i) {
  spl <- df_folds$splits[[i]]
  
  # With repeats, the rset has id (repeat) and id2 (fold); otherwise just id
  if (ncol(df_folds) == 2) {
    fold_lab <- df_folds$id[[i]]
  } else {
    fold_lab <- paste(df_folds$id[[i]], df_folds$id2[[i]])
  }
  
  bind_rows(
    analysis(spl) %>% mutate(split = "analysis"),
    assessment(spl) %>% mutate(split = "assessment")
  ) %>%
    mutate(fold = fold_lab)
}))

# Density of log(POC) per fold, analysis vs assessment
ggplot(dist_folds) +
  geom_density(
    aes(x = poc_log, colour = fold, linetype = split),
    show.legend = FALSE
  ) +
  facet_wrap(~fold)

# Same densities overlaid across folds, one panel per split type
ggplot(dist_folds) +
  geom_density(aes(x = poc_log, colour = fold, linetype = split)) +
  facet_wrap(~split, nrow = 2)

Once again, distribution varies a lot across assessment splits. Here the prediction task should be harder because each assessment fold is not represented at all in its corresponding analysis fold.

Train model

# Define a xgboost model with hyperparameters to tune
xgb_spec <- boost_tree(
  trees = tune(),       # number of boosting rounds
  tree_depth = tune(),  # maximum tree depth
  min_n = tune(),       # minimum node size
  learn_rate = tune()   # shrinkage
) %>%
  set_mode("regression") %>%
  # xgboost's threading argument is `nthread`; `num.threads` is a ranger
  # argument and is silently ignored, so the model ran single-threaded
  set_engine("xgboost", nthread = n_cores)


## Recipe & Workflow
# Build the model formula from the list of explanatory variables. Raw `poc`
# is put in the formula only so it travels with the data; its role is then
# changed so it is not used as a predictor.
xgb_form <- as.formula(
  paste0("poc_log ~ ", paste(c("poc", exp_vars), collapse = " + "))
)
xgb_rec <- recipe(xgb_form, data = df) %>%
  update_role(poc, new_role = "untransformed outcome")

xgb_wflow <- workflow() %>%
  add_recipe(xgb_rec) %>%
  add_model(xgb_spec)


## Gridsearch
# Latin hypercube over the four tuned hyperparameters
set.seed(seed)
xgb_grid <- grid_latin_hypercube(
  trees(), learn_rate(), tree_depth(), min_n(),
  size = 30
)
doParallel::registerDoParallel()
set.seed(seed)
xgb_res <- tune_grid(
  xgb_wflow,
  resamples = df_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)

# Keep the hyperparameter combination with the lowest RMSE
best_xgb <- select_best(xgb_res, metric = "rmse")

Evaluate model

## Finalize: plug the selected hyperparameters into the workflow
final_xgb <- finalize_workflow(xgb_wflow, best_xgb)

## Evaluate: refit on each analysis set and score each assessment set
final_res_scv <- fit_resamples(final_xgb, df_folds)
collect_metrics(final_res_scv)
# A tibble: 2 × 6
  .metric .estimator  mean     n std_err .config             
  <chr>   <chr>      <dbl> <int>   <dbl> <chr>               
1 rmse    standard   0.626    10  0.0963 Preprocessor1_Model1
2 rsq     standard   0.554    10  0.102  Preprocessor1_Model1
# Per-fold R² values, tagged with the CV method for the final comparison.
# Use FALSE, not F (T/F are ordinary, reassignable bindings).
sccv_metrics <- collect_metrics(final_res_scv, summarize = FALSE) %>%
  filter(.metric == "rsq") %>%
  mutate(cv_method = "Clust")
ggplot(sccv_metrics) +
  geom_boxplot(aes(x = .metric, y = .estimate, group = .metric)) +
  geom_jitter(aes(x = .metric, y = .estimate, colour = id))

Note

Even larger dispersion of R².

Overall comparison of R²

# Combine per-fold R² from all four CV schemes (insertion order preserved
# by fct_inorder) and compare their spread
bind_rows(cv_metrics, cvmc_metrics, sbcv_metrics, sccv_metrics) %>%
  mutate(cv_method = fct_inorder(cv_method)) %>%
  ggplot() +
  geom_boxplot(aes(x = cv_method, y = .estimate, group = cv_method))

Note

The solution: nested cross validation!